In [1]:
#currently running directatory
import  os
os.getcwd()
Out[1]:
'C:\\Users\\ADMIN'
In [2]:
#Importing all important package's for predictiond and calculation.
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
In [3]:
#Importing the file to test and run the functions.
data= pd.read_excel('C:\\Users\\ADMIN\\1788408-1767133-1729258-1613615-Stock_Price_data_set.xlsx')
In [4]:
#Reading of the data.
data
Out[4]:
Date Open High Low Close Adj Close Volume
0 2018-02-05 262.000000 267.899994 250.029999 254.259995 254.259995 11896100.0
1 2018-02-06 247.699997 266.700012 245.000000 265.720001 265.720001 12595800.0
2 2018-02-07 266.579987 272.450012 264.329987 264.559998 264.559998 8981500.0
3 2018-02-08 267.079987 267.619995 250.000000 250.100006 250.100006 9306700.0
4 2018-02-09 253.850006 255.800003 236.110001 249.470001 249.470001 16906900.0
... ... ... ... ... ... ... ...
1004 2022-01-31 401.970001 427.700012 398.200012 427.140015 427.140015 20047500.0
1005 2022-02-01 432.959991 458.480011 425.540009 457.130005 457.130005 22542300.0
1006 2022-02-02 448.250000 451.980011 426.480011 429.480011 429.480011 14346000.0
1007 2022-02-03 421.440002 429.260010 404.279999 405.600006 405.600006 9905200.0
1008 2022-02-04 407.309998 412.769989 396.640015 410.170013 410.170013 7782400.0

1009 rows × 7 columns

In [5]:
#Reading the first 5 rows.
data.head()
Out[5]:
Date Open High Low Close Adj Close Volume
0 2018-02-05 262.000000 267.899994 250.029999 254.259995 254.259995 11896100.0
1 2018-02-06 247.699997 266.700012 245.000000 265.720001 265.720001 12595800.0
2 2018-02-07 266.579987 272.450012 264.329987 264.559998 264.559998 8981500.0
3 2018-02-08 267.079987 267.619995 250.000000 250.100006 250.100006 9306700.0
4 2018-02-09 253.850006 255.800003 236.110001 249.470001 249.470001 16906900.0
In [6]:
#Reading the last 5 rows.
data.tail()
Out[6]:
Date Open High Low Close Adj Close Volume
1004 2022-01-31 401.970001 427.700012 398.200012 427.140015 427.140015 20047500.0
1005 2022-02-01 432.959991 458.480011 425.540009 457.130005 457.130005 22542300.0
1006 2022-02-02 448.250000 451.980011 426.480011 429.480011 429.480011 14346000.0
1007 2022-02-03 421.440002 429.260010 404.279999 405.600006 405.600006 9905200.0
1008 2022-02-04 407.309998 412.769989 396.640015 410.170013 410.170013 7782400.0
In [7]:
#Describtion of the data.
data.describe
Out[7]:
<bound method NDFrame.describe of            Date        Open        High         Low       Close   Adj Close  \
0    2018-02-05  262.000000  267.899994  250.029999  254.259995  254.259995   
1    2018-02-06  247.699997  266.700012  245.000000  265.720001  265.720001   
2    2018-02-07  266.579987  272.450012  264.329987  264.559998  264.559998   
3    2018-02-08  267.079987  267.619995  250.000000  250.100006  250.100006   
4    2018-02-09  253.850006  255.800003  236.110001  249.470001  249.470001   
...         ...         ...         ...         ...         ...         ...   
1004 2022-01-31  401.970001  427.700012  398.200012  427.140015  427.140015   
1005 2022-02-01  432.959991  458.480011  425.540009  457.130005  457.130005   
1006 2022-02-02  448.250000  451.980011  426.480011  429.480011  429.480011   
1007 2022-02-03  421.440002  429.260010  404.279999  405.600006  405.600006   
1008 2022-02-04  407.309998  412.769989  396.640015  410.170013  410.170013   

          Volume  
0     11896100.0  
1     12595800.0  
2      8981500.0  
3      9306700.0  
4     16906900.0  
...          ...  
1004  20047500.0  
1005  22542300.0  
1006  14346000.0  
1007   9905200.0  
1008   7782400.0  

[1009 rows x 7 columns]>
In [8]:
#reading the title or the heading of the data.
data.count()
Out[8]:
Date         1009
Open         1009
High         1009
Low          1009
Close        1009
Adj Close    1009
Volume       1009
dtype: int64
In [9]:
data.index
Out[9]:
RangeIndex(start=0, stop=1009, step=1)
In [10]:
data.columns
Out[10]:
Index(['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume'], dtype='object')
In [11]:
#To show the completely data set info/describtion.
data.describe(include = 'all')
C:\Users\ADMIN\Anaconda3\lib\site-packages\ipykernel_launcher.py:1: FutureWarning: Treating datetime data as categorical rather than numeric in `.describe` is deprecated and will be removed in a future version of pandas. Specify `datetime_is_numeric=True` to silence this warning and adopt the future behavior now.
  """Entry point for launching an IPython kernel.
Out[11]:
Date Open High Low Close Adj Close Volume
count 1009 1009.000000 1009.000000 1009.000000 1009.000000 1009.000000 1.009000e+03
unique 1009 NaN NaN NaN NaN NaN NaN
top 2018-02-05 00:00:00 NaN NaN NaN NaN NaN NaN
freq 1 NaN NaN NaN NaN NaN NaN
first 2018-02-05 00:00:00 NaN NaN NaN NaN NaN NaN
last 2022-02-04 00:00:00 NaN NaN NaN NaN NaN NaN
mean NaN 419.059673 425.320703 412.374044 419.000733 419.000733 7.570685e+06
std NaN 108.537532 109.262960 107.555867 108.289999 108.289999 5.465535e+06
min NaN 233.919998 250.649994 231.229996 233.880005 233.880005 1.144000e+06
25% NaN 331.489990 336.299988 326.000000 331.619995 331.619995 4.091900e+06
50% NaN 377.769989 383.010010 370.880005 378.670013 378.670013 5.934500e+06
75% NaN 509.130005 515.630005 502.529999 509.079987 509.079987 9.322400e+06
max NaN 692.349976 700.989990 686.090027 691.690002 691.690002 5.890430e+07
In [12]:
#To show the datatypes of the data.
data.dtypes
Out[12]:
Date         datetime64[ns]
Open                float64
High                float64
Low                 float64
Close               float64
Adj Close           float64
Volume              float64
dtype: object
In [13]:
#To see the null value of the data.
data.isna().sum()
Out[13]:
Date         0
Open         0
High         0
Low          0
Close        0
Adj Close    0
Volume       0
dtype: int64
In [14]:
#To see the null value in bool form.
data.isna().any()
Out[14]:
Date         False
Open         False
High         False
Low          False
Close        False
Adj Close    False
Volume       False
dtype: bool
In [15]:
#To clean all the null value or to see the null values.
data.isnull().sum().sum()
Out[15]:
0
In [16]:
data['Date'] = pd.to_datetime(data.Date)
In [17]:
data.drop('Adj Close',axis =1, inplace = True)
In [18]:
print(len(data))
1009
In [19]:
data['Open'].plot(figsize=(16,6))
Out[19]:
<AxesSubplot:>
In [20]:
# Correlation matrix
def plot_corr_matrix(data, g_width):
    file_name = datd.dataframeName
    data = data.dropna('columns') # drop columns with NaN
    data = data[[col for col in data if data[col].nunique() > 1]] # keep columns where there are more than 1 unique values
    if data.shape[1] < 2:
        print(f'No correlation plots shown: The number of non-NaN or constant columns ({data.shape[1]}) is less than 2')
        return
    corr = data.corr()
    plt.figure(num=None, figsize=(g_width, g_width), dpi=80, facecolor='w', edgecolor='k')
    corr_matrix = plt.matshow(corr, fignum = 1)
    plt.xticks(range(len(corr.columns)), corr.columns, rotation=90)
    plt.yticks(range(len(corr.columns)), corr.columns)
    plt.gca().xaxis.tick_bottom()
    plt.colorbar(corr_matrix)
    plt.title(f'Correlation Matrix for {file_name}', fontsize=15)
    plt.show()
In [21]:
# Scatter and density plots
def plot_scatter_mat(data, plot_size, text_size):
    data = data.select_dtypes(include =[np.number]) # keep only numerical columns
    ## drop nan values
    data = data.dropna('columns')
    data = data[[col for col in data if data[col].nunique() > 1]] # keep columns where there are more than 1 unique values
    column_names = list(data)
    if len(column_names) > 10:
        column_names = column_names[:10]
    data = data[column_names]
    ax = pd.plotting.scatter_matrix(data, alpha=0.75, figsize=[plot_size, plot_size], diagonal='kde')
    corr = data.corr().values
    for i, j in zip(*plt.np.triu_indices_from(ax, k = 1)):
        ax[i, j].annotate('Corr. coef = %.3f' % corr[i, j], (0.8, 0.2), xycoords='axes fraction', ha='center', va='center', size=text_size)
    plt.suptitle('Scatter and Density Plot')
    plt.show()
In [22]:
plot_scatter_mat(data, 25, 20)
C:\Users\ADMIN\Anaconda3\lib\site-packages\ipykernel_launcher.py:5: FutureWarning: In a future version of pandas all arguments of DataFrame.dropna will be keyword-only
  """
In [23]:
corr = data.corr()
In [24]:
plt.figure(num=25, figsize=(35, 45), dpi=80, facecolor='w', edgecolor='k')
Out[24]:
<Figure size 2800x3600 with 0 Axes>
<Figure size 2800x3600 with 0 Axes>
In [25]:
plt.figure(num=25, figsize=(135, 145), dpi=80, facecolor='w', edgecolor='k')
corr_matrix = plt.matshow(corr, fignum = 1)
<Figure size 10800x11600 with 0 Axes>
In [26]:
corr = data.corr()
plt.figure(num= 25, figsize=(25, 20), dpi=80, facecolor='w', edgecolor='k')
corr_matrix = plt.matshow(corr, fignum = 1)
plt.xticks(range(len(corr.columns)), corr.columns, rotation=90)
plt.yticks(range(len(corr.columns)), corr.columns)
plt.gca().xaxis.tick_bottom()
plt.colorbar(corr_matrix)
plt.title(f'Correlation Matrix for {data}', fontsize=10)
plt.show()
<Figure size 2000x1600 with 0 Axes>
In [27]:
x = data[['Open','High','Low','Volume']]
y = data['Close']
In [28]:
#To test and train the model.
from sklearn.model_selection import train_test_split
x_train , x_test , y_train , y_test = train_test_split(x ,y, random_state = 0)
In [29]:
x_train.shape
Out[29]:
(756, 4)
In [30]:
x_test.shape
Out[30]:
(253, 4)
In [31]:
y_train.shape
Out[31]:
(756,)
In [32]:
y_test.shape
Out[32]:
(253,)
In [33]:
#Ploting the graph of complete dataset.
import plotly.graph_objects as go
import plotly.express as px
In [34]:
figure = px.bar(data, x = 'Date', y = 'Close')
figure.show()

#You can expand the graph to see in more detailed valued of stock price
In [35]:
figure = px.line(data, x='Date' , y='Close', title='Stock price with rangeslider')
figure.update_xaxes(rangeslider_visible=True)
figure.show()
In [36]:
#Ployting all the bar graphs from date till volume of the stock price insteading of doing sperately each graph name is mentioned below it..
plt.figure(figsize=(15,10))
sns.histplot(data = data,  x = 'Date', kde = True)
plt.figure(figsize=(15,10))
sns.histplot(data = data,  x = 'Open', kde = True)
plt.figure(figsize=(15,10))
sns.histplot(data = data,  x = 'High', kde = True)
plt.figure(figsize=(15,10))
sns.histplot(data = data,  x = 'Low', kde = True)
plt.figure(figsize=(15,10))
sns.histplot(data = data,  x = 'Close', kde = True)
plt.figure(figsize=(15,10))
sns.histplot(data = data,  x = 'Volume', kde = True)
Out[36]:
<AxesSubplot:xlabel='Volume', ylabel='Count'>
In [37]:
#To calculate the data using Linear Regression model.
regressor.fit(x_train,y_train)
Out[37]:
LinearRegression()
In [38]:
print(regressor.coef_)
[-5.98637669e-01  7.42752459e-01  8.57948723e-01  9.68159262e-08]
In [39]:
print(regressor.intercept_)
-0.7077595574160114
In [40]:
#Prediction of the data.
predicted=regressor.predict(x_test)
In [41]:
print(x_test)
            Open        High         Low      Volume
801   557.000000  559.750000  550.299988   2720300.0
311   378.000000  383.500000  374.510010   5398200.0
85    368.540009  368.700012  357.799988   8278000.0
435   278.049988  285.750000  277.350006   6248400.0
204   260.549988  266.250000  253.800003  12498600.0
...          ...         ...         ...         ...
583   418.829987  426.720001  415.980011   3743700.0
200   283.790009  285.089996  269.149994  12993800.0
767   525.000000  548.539978  518.280029   4136500.0
1000  379.140015  387.709991  365.130005  15145800.0
385   298.859985  303.549988  296.269989   6905800.0

[253 rows x 4 columns]
In [42]:
predicted.shape
Out[42]:
(253,)
In [43]:
dframe=pd.DataFrame(y_test,predicted)
In [44]:
dfr=pd.DataFrame({'Actual':y_test,'Predicted':predicted})
In [45]:
print(dfr)
          Actual   Predicted
801   553.729980  553.999288
311   379.059998  379.685786
85    361.399994  360.298634
435   281.859985  283.639587
204   261.429993  260.032497
...          ...         ...
583   425.920013  422.764132
200   270.600006  273.331047
767   546.150024  537.495050
1000  366.420013  375.026471
385   302.799988  300.698946

[253 rows x 2 columns]
In [46]:
dfr.head(10)
Out[46]:
Actual Predicted
801 553.729980 553.999288
311 379.059998 379.685786
85 361.399994 360.298634
435 281.859985 283.639587
204 261.429993 260.032497
590 434.480011 434.730957
1 265.720001 260.518692
780 518.020020 518.570772
457 315.929993 315.400869
299 348.869995 345.455041
In [47]:
regressor.score(x_test,y_test)
Out[47]:
0.9982601041694543
In [48]:
import math
In [49]:
print('Mean Absolute Error:',metrics.mean_absolute_error(y_test,predicted))
Mean Absolute Error: 3.124088127372266
In [50]:
print('Mean Squared Error:',metrics.mean_squared_error(y_test,predicted))
Mean Squared Error: 19.16361234146909
In [51]:
print('Root Mean Squared Error:',math.sqrt(metrics.mean_squared_error(y_test,predicted)))
Root Mean Squared Error: 4.377626336437258
In [52]:
graph=dfr.head(30)
#We can't plot all the rows and columns graph so we choose random 30 numbers to plot graph.
In [53]:
graph.plot(kind='bar')
Out[53]:
<AxesSubplot:>
In [ ]: